###########################################################################################################
# Replication for county-level analysis for West Germany presented in Table A30
# Charnysh and Schaub, Migration and Social Change: Evidence from post-WWII Displacement in Germany, JOP
# R version 4.3.1 (2023-06-16) 
###########################################################################################################


rm(list = ls())

library(openxlsx)
library(dplyr)
library(plyr)
library(sandwich)
library(lmtest)
library(stargazer)


setwd("/Users/volhacharnysh/Dropbox/Vertriebene/JOP submission/Replication")

#merging three sources of data: 
# 1. Information about refugees, religion, pre-treatment variables: 
dat<-read.xlsx("county_data/dataset19391950.xlsx") 

dat[which(dat$KREIS50=="5536000"), c("Pop1950", "Evang1950", "Kath1950", "Judische1950", "Heimatvertriebene1950", "VertrEvang1950", "VertrKath1950")]
dat$Evang1950[which(dat$KREIS50=="5536000")]<-13184  ##Correct Evang1950 for Landkreis Münster based on Gemeindestatistik Nordheim Westfalen, pages 1-23 

#1950 Kreise that need to be modified later on for merging with 1960 data: Stadtkreis Konstanz (8313000) became part of Landkreis Konstanz after 1953 (8337000)
dat$KREIS50[which(dat$KREIS50=="8313000")]<-"8337000"

cols_to_sum <- c("Heimatvertriebene1950","VertrEvang1950", "VertrKath1950","Pop1950","Evang1950","Kath1950","Judische1950", "NormalWohnungen", "KriegsschaedenNWohnungen", 
 "Population1939", "Tot1939", "Women1939","Evangelische1939", "Katholische1939","AgricultureInsgesamt1939", "Erwerbspersonen_Total1939", "AREA50")  
cols_to_avg <- c("distance_to_borderkm")
cols_unchanged <- c("ID", "Kennziffer" ,"Type", "Kreis", "Regierungsbezirk", "Land", "OccZone50")


# Aggregating data: sum, average, and keep unchanged columns grouped by 'Group'
dat19391950adj <- ddply(dat, .(KREIS50), function(subdf) {
  data.frame(
    subdf[1, cols_unchanged, drop = FALSE],  # Keep the unchanged columns up front
    lapply(subdf[cols_to_sum], function(x) if (is.numeric(x)) sum(x) else NA),  # Sum only numeric columns
    lapply(subdf[cols_to_avg], function(x) if (is.numeric(x)) mean(x) else NA)  # Average only numeric columns
  )
})
dim(dat19391950adj) #N=556

#2. Elections: 
election<-read.xlsx("county_data/election196165with1950id.xlsx")
head(election)
election<-subset(election, !is.na(KREIS50))
#We added KREIS50 and Kreis1950 from 1939-1950 file to the dataset.
#Territorial changes: Wolfsburg, Stadt was separated from Gifhorn in 1951 Has same KREIS50=3334000. Leverkusen, Stadt (separated from Rhein-Wupperkreis). Has same KREIS50=5139000
#Data for Saarland, which was not a part of Germany in 1950 should be dropped. 
cols_to_sum <- c("Wahlberechtigte1961", "Wähler1961","CDU1961","CSU1961", "Wahlberechtigte1965","Wähler1965","CDU1965","CSU1965")  
cols_unchanged <- c("ID","Kennziffer", "Type","Kreis1950","Land", "Name1961")


# Aggregating data: sum, average, and keep unchanged columns grouped by 'Group'. Rows without KREIS50 are dropped (Saarland)
el1961adj <- ddply(election, .(KREIS50), function(subdf) {
  data.frame(
    lapply(subdf[cols_to_sum], function(x) if (is.numeric(x)) sum(x) else NA)  
  )
})

# 2. Information on religious affiliation and divorce in 1961 census:
census1961<-read.xlsx("county_data/County_data_west_germany_cleaned.xlsx")
head(census1961)
census1961<-subset(census1961, !is.na(KREIS50))
census1961$Geschieden1961<-as.numeric(census1961$Geschieden1961)

#Territorial changes: Wolfsburg was separated from Gifhorn in 1951; Leverkusen was separated from Rhein-Wupperkreis in 1955
#Konstanz was joined with  Landkreis Konstanz after 1953, but this is already adjusted in dat19391950adj

cols_to_sum <- c("Pop1961","Gemeinschaftslose1961", "Verheiratet1961","Geschieden1961")  


# Aggregating data: sum, average, and keep unchanged columns grouped by 'Group'. Rows without KREIS50 are dropped (Saarland)
census1961adj <- ddply(census1961, .(KREIS50), function(subdf) {
  data.frame(
    lapply(subdf[cols_to_sum], function(x) if (is.numeric(x)) sum(x) else NA)  # Sum only numeric columns
  )
})

#N=556 with West Berlin



## Merging the data: 

dat0<-merge(dat19391950adj, census1961adj, by="KREIS50", all.x=TRUE)
dat<-merge(dat0, el1961adj, by="KREIS50", all.x=TRUE)

dim(dat) #N=556 (with W Berlin, for which a lot of data are missing so we dont actually use it in regressions)


###Compute variables used in regressions:

dat$srefugees<-dat$Heimatvertriebene1950/dat$Pop1950

dat$ShareDestroyed<-dat$KriegsschaedenNWohnungen/dat$NormalWohnungen 
dat$sfemale1939<-dat$Women1939/dat$Tot1939 #using here populaton total that came from the same file as gender for consistency (data from Braun and Franke 2021)
dat$PopDensity1939<-dat$Population1939/dat$AREA50 
dat$sprotestants1939<-dat$Evangelische1939/dat$Population1939
dat$scatholics1939<-dat$Katholische1939/dat$Population1939
dat$OtherReligion1939<-dat$Population1939-dat$Katholische1939-dat$Evangelische1939
dat$sother1939<-dat$OtherReligion1939/dat$Population1939
dat$sprotestants1950<-dat$Evang1950/dat$Pop1950
dat$scatholics1950<-dat$Kath1950/dat$Pop1950
dat$sother1950<-(dat$Pop1950-dat$Kath1950-dat$Evang1950)/dat$Pop1950

dat$relfrac1939<-1-(dat$sprotestants1939^2+dat$scatholics1939^2+dat$sother1939^2)
dat$relfrac1950<-1-(dat$sprotestants1950^2+dat$scatholics1950^2+dat$sother1950^2)

dat$lnDistEastBorder<-log(dat$distance_to_borderkm)

dat$DeltaFrac<-dat$relfrac1950-dat$relfrac1939
dat$ShareAgric1939<-dat$AgricultureInsgesamt1939/dat$Erwerbspersonen_Total1939

##Outcomes: 
dat$PtCDUCSU1961<-(dat$CSU1961+dat$CDU1961)/dat$Wahlberechtigte1961
dat$PtCDUCSU1965<-(dat$CSU1965+dat$CDU1965)/dat$Wahlberechtigte1965
dat$AvgPtCDUCSU196165<-(dat$PtCDUCSU1961+dat$PtCDUCSU1965)/2
dat$PtAffiliated1961<-1-dat$Gemeinschaftslose1961/dat$Pop1961
dat$PtDivtoMar1961<-dat$Geschieden1961/dat$Verheiratet1961

########################################################################################################################
###### Table A30: Correlations between Delta diversity and select outcomes at the county level for all of West Germany
########################################################################################################################


m1<-lm(PtAffiliated1961~DeltaFrac+relfrac1939+srefugees+ShareAgric1939+sfemale1939+ShareDestroyed+PopDensity1939+lnDistEastBorder+log(Population1939)+Land, data=dat) 
m1.c <- coeftest(m1, vcov=vcovHC(m1, type="HC1"))

m2<-lm(PtAffiliated1961~DeltaFrac+relfrac1939+ShareAgric1939+sfemale1939+ShareDestroyed+PopDensity1939+lnDistEastBorder+log(Population1939)+Land, data=dat) 
m2.c <- coeftest(m2, vcov=vcovHC(m2, type="HC1"))

m3<-lm(PtDivtoMar1961~DeltaFrac+relfrac1939+srefugees+ShareAgric1939+sfemale1939+ShareDestroyed+PopDensity1939+lnDistEastBorder+log(Population1939)+Land, data=dat) 
m3.c <- coeftest(m3, vcov=vcovHC(m3, type="HC1"))

m4<-lm(PtDivtoMar1961~DeltaFrac+relfrac1939+ShareAgric1939+sfemale1939+ShareDestroyed+PopDensity1939+lnDistEastBorder+log(Population1939)+Land, data=dat) 
m4.c <- coeftest(m4, vcov=vcovHC(m4, type="HC1"))

m5<-lm(AvgPtCDUCSU196165~DeltaFrac+relfrac1939+srefugees+ShareAgric1939+sfemale1939+ShareDestroyed+PopDensity1939+lnDistEastBorder+log(Population1939)+Land, data=dat) 
m5.c <- coeftest(m5, vcov=vcovHC(m5, type="HC1"))

m6<-lm(AvgPtCDUCSU196165~DeltaFrac+relfrac1939+ShareAgric1939+sfemale1939+ShareDestroyed+PopDensity1939+lnDistEastBorder+log(Population1939)+Land, data=dat) 
m6.c <- coeftest(m6, vcov=vcovHC(m6, type="HC1"))


means_dv <- c(rep(round(mean(dat$PtAffiliated1961,na.rm=T),2), 2),rep(round(mean(dat$PtDivtoMar1961,na.rm=T),2), 2), rep(round(mean(dat$AvgPtCDUCSU196165,na.rm=T),2), 2))

sds_dv <- c(rep(round(sd(dat$PtAffiliated1961,na.rm=T),2), 2), rep(round(sd(dat$PtDivtoMar1961,na.rm=T),2), 2),rep(round(sd(dat$AvgPtCDUCSU196165,na.rm=T),2), 2))

mean_row <- c("DV mean", means_dv)
sd_row <- c("DV sd", sds_dv)

stargazer(m1, m2, m3, m4, m5, m6,
          se = list(m1.c[, 2], m2.c[, 2], m3.c[, 2], m4.c[, 2], m5.c[, 2], m6.c[, 2]),
          p = list(m1.c[, 4], m2.c[, 4], m3.c[, 4], m4.c[, 4], m5.c[, 4], m6.c[, 4]), 
          omit.stat = c("LL", "ser", "f", "rsq"),
          omit = c("Land", "Constant"),
          covariate.labels = c("Delta diversity", "Religious fractionalization 1939", 
                               "Share expellees (1950)",
                               "Share in agriculture (1939)", 
                               "Share female (1939)",
                               "Share destroyed (1950)", 
                               "Population density (1939)", 
                               "Ln distance to Eastern border",
                               "Ln population (1939)"),
          no.space=TRUE,
          digits = 2,
          add.lines = list(mean_row, sd_row)  
)


